#############################################################
### Geographic Representativeness of Legislatures Project ###
###                                                       ###
###           Leonardo Carella, Andrew Eggers             ###
#############################################################

# This is the THIRD script to run to replicate the cross-country analysis. 
# It uses SURLI scores and covariates to visualise and to model 
# the relationship between country-level variables and spatial 
# un-representativeness of legislatures.


# It requires three datasets as inputs:

# 1. the surli_crosscountry_variables.rds file, containing covariates
# 2. cdfds_w_surlis_4_increments_2005.rds: real parliament and ‘fake parliament’
# discrepancy scores across 500 simulations (2005 benchmark)
# 3. cdfds_w_surlis_4_increments_birthyear.rds: real parliament and ‘fake parliament’
# discrepancy scores across 500 simulations (mean MP birthyear benchmark)

#### Packages ####

library(ggrepel)
library(tidyverse)
library(stargazer)

#### Loading and preparing data ####

df <- readRDS("surli_crosscountry_variables.rds")
surlibirth <- readRDS("cdfds_w_surlis_4_increments_birthyear.rds")
surli2005 <- readRDS("cdfds_w_surlis_4_increments_2005.rds")

surli2005$prop_surli2005 <- NA
surlibirth$prop_surlibirth <- NA
surli2005$zscore_surli2005 <- NA
surlibirth$zscore_surlibirth <- NA

for (i in 1:62)
{
  surli2005$prop_surli2005[i] <- surli2005$real_emd[i]/mean(unlist(surli2005$fake_emd[i]))
  surlibirth$prop_surlibirth[i] <- surlibirth$real_emd[i]/mean(unlist(surlibirth$fake_emd[i]))
  surli2005$zscore_surli2005[i] <- (surli2005$real_emd[i] - mean(unlist(surli2005$fake_emd[i])))/
    (sd(unlist(surli2005$fake_emd[i])))
  surlibirth$zscore_surlibirth[i] <- (surlibirth$real_emd[i] - mean(unlist(surlibirth$fake_emd[i])))/
    (sd(unlist(surlibirth$fake_emd[i])))
}

df <- left_join(df, surli2005 %>% 
                  dplyr::select(country, prop_surli2005, zscore_surli2005), by = "country")
df <- left_join(df, surlibirth %>% 
                  dplyr::select(country, prop_surlibirth, zscore_surlibirth), by = "country")

#### Visualisations ####

# Comparison between the two versions of SURLI (figure 2)

g <- ggplot(data = df, mapping = aes(x = prop_surli2005, y = prop_surlibirth, label = country_text_id)) 
g + geom_point() + geom_text_repel(stat = "identity", size = 2.75, show.legend = FALSE) + 
  theme_minimal() + geom_smooth(method = "lm", se = T, 
                                col = "black", lty = 3, size = 0.4, 
                                alpha = 0.15,show.legend = FALSE) + 
  xlab("SURLI (2005 population)") + ylab("SURLI (mean legislator birth year population)") + 
  ggtitle("Relationship between our main SURLI index (x-axis)
and the alternative measure (y-axis)") + theme(plot.title = element_text(hjust = 0.5))

ggsave("surli_comparison_scatterplot.jpg", width = 8.5, height = 8.5)

# SURLI scores by electoral system family (figure 3)

df1 <- df %>%
  pivot_longer(cols = c(prop_surli2005, prop_surlibirth), names_to = "type", 
               values_to = "mean_surli")

df1$type[df1$type == "prop_surlibirth"] <- "SURLI (mean legislator birth year population)"
df1$type[df1$type == "prop_surli2005"] <- "SURLI (2005 population)"

g <- ggplot(data = df1, mapping = aes(x = major_es_categories, 
                                      group = major_es_categories, 
                                      color = major_es_categories, 
                                      fill = major_es_categories,
                                      shape = major_es_categories,
                                      y = mean_surli))


g  + theme_minimal() +
  geom_boxplot(alpha = 0.2) + 
  geom_point(position = position_dodge(width = 0.5)) + 
  facet_wrap(~type) + scale_color_manual(values = c("tomato", "orange",
                                                    "royalblue", "violet"),
                                         labels = c("Mixed-Member",
                                                    "Multi-Member with 
preferential voting",
                                                    "Multi-Member without 
preferential voting",
                                                    "Single-Member"),
                                         name = "Electoral Systems") +
  scale_shape_manual(values = c(21,22,24,25),
                     labels = c("Mixed-Member",
                                "Multi-Member with 
preferential voting",
                                "Multi-Member without 
preferential voting",
                                "Single-Member"),
                     name = "Electoral Systems") +   
  scale_fill_manual(values = c("tomato", "orange",
                               "royalblue", "violet"),
                    labels = c("Mixed-Member",
                               "Multi-Member with 
preferential voting",
                               "Multi-Member without 
preferential voting",
                               "Single-Member"),
                    name = "Electoral Systems") +
  scale_x_discrete(labels = c("MXM",
                              "MTM, 
with PV",
                              "MTM, 
no PV",
                              "SM")) + 
  theme(legend.position = "bottom") + 
  xlab("") + ylab("SURLI") + theme(strip.text = element_text(size = 12)) + 
  ggtitle("SURLI by Constituency and Ballot Structure") + 
  theme(plot.title = element_text(hjust = 0.5, size = 16),
        axis.text.x = element_text(size = 11),
        legend.text = element_text(size = 10)) + 
  guides(fill=guide_legend(nrow=2,byrow=TRUE),
         col=guide_legend(nrow=2,byrow=TRUE))

ggsave("boxplots_for_paper_29102022.jpg", width = 11, height = 9)

# SURLI scores by median DM (figure 4)

df2 <- df %>%
  pivot_longer(cols = c(prop_surlibirth, prop_surli2005), names_to = "type", 
               values_to = "value")

df2$type[df2$type == "prop_surli2005"] <- "SURLI (2005 population)"
df2$type[df2$type == "prop_surlibirth"] <- "SURLI (mean legislator birth year population)"

g1 <- ggplot(data = df2, mapping = aes(x = median_nat_ter, 
                                       color = major_es_categories,
                                       shape = major_es_categories,
                                       fill = major_es_categories,
                                       group = NA,
                                       y = value,
                                       label = country_text_id))

g1 + geom_point() + geom_text_repel(stat = "identity", size = 2.75, show.legend = FALSE) + 
  theme_minimal() + geom_smooth(method = "lm", se = T, 
                                col = "black", lty = 3, size = 0.4, 
                                alpha = 0.15,show.legend = FALSE) +
  scale_color_manual(values = c("tomato", "orange",
                                "royalblue", "violet"),
                     labels = c("Mixed-Member",
                                "Multi-Member with 
preferential voting",
                                "Multi-Member without 
preferential voting",
                                "Single-Member"),
                     name = "Electoral Systems") + 
  scale_shape_manual(values = c(21,22,24,25),
                     labels = c("Mixed-Member",
                                "Multi-Member with 
preferential voting",
                                "Multi-Member without 
preferential voting",
                                "Single-Member"),
                     name = "Electoral Systems") +   
  scale_fill_manual(values = c("tomato", "orange",
                               "royalblue", "violet"),
                    labels = c("Mixed-Member",
                               "Multi-Member with 
preferential voting",
                               "Multi-Member without 
preferential voting",
                               "Single-Member"),
                    name = "Electoral Systems") +
  xlab("Median District Magnitude (log)") + 
  ylab("SURLI")+ 
  theme(legend.position = "bottom") +
  guides(fill=guide_legend(nrow=2,byrow=TRUE),
         col=guide_legend(nrow=2,byrow=TRUE)) + 
  facet_wrap(~type,strip.position="top") + 
  ggtitle("SURLI by district magnitude") +
  scale_x_log10() + 
  theme(strip.text = element_text(size = 11),
        axis.text.y = element_text(size = 11),
        plot.title = element_text(hjust = 0.5, size = 16)) 

ggsave("scatterplots_for_paper_29102022.jpg", width = 11, height = 9)

# SURLI scores by median DM (figure S4 in supplementary material)

g1 <- ggplot(data = df2, mapping = aes(x = mean_nat_ter, 
                                       color = major_es_categories,
                                       shape = major_es_categories,
                                       fill = major_es_categories,
                                       group = NA,
                                       y = value,
                                       label = country_text_id))

g1 + geom_point() + geom_text_repel(stat = "identity", size = 2.75, show.legend = FALSE) + 
  theme_minimal() + geom_smooth(method = "lm", se = T, 
                                col = "black", lty = 3, size = 0.4, 
                                alpha = 0.15,show.legend = FALSE) +
  scale_color_manual(values = c("tomato", "orange",
                                "royalblue", "violet"),
                     labels = c("Mixed-Member",
                                "Multi-Member with 
preferential voting",
                                "Multi-Member without 
preferential voting",
                                "Single-Member"),
                     name = "Electoral Systems") + 
  scale_shape_manual(values = c(21,22,24,25),
                     labels = c("Mixed-Member",
                                "Multi-Member with 
preferential voting",
                                "Multi-Member without 
preferential voting",
                                "Single-Member"),
                     name = "Electoral Systems") +   
  scale_fill_manual(values = c("tomato", "orange",
                               "royalblue", "violet"),
                    labels = c("Mixed-Member",
                               "Multi-Member with 
preferential voting",
                               "Multi-Member without 
preferential voting",
                               "Single-Member"),
                    name = "Electoral Systems") +
  xlab("Mean District Magnitude (log)") + 
  ylab("SURLI")+ 
  theme(legend.position = "bottom") +
  guides(fill=guide_legend(nrow=2,byrow=TRUE),
         col=guide_legend(nrow=2,byrow=TRUE)) + 
  facet_wrap(~type,strip.position="top") + 
  ggtitle("SURLI by district magnitude") +
  scale_x_log10() + 
  theme(strip.text = element_text(size = 11),
        axis.text.y = element_text(size = 11),
        plot.title = element_text(hjust = 0.5, size = 16)) 

ggsave("scatterplots_for_paper_with_mean_DM.jpg", width = 11, height = 9)

#### Cross-country regression models ####

# Models with median DM, categorical constituency structure

fit1a <- lm(data = df, prop_surli2005 ~ 
              major_es_categories2 +
              pv_cat +
              log(median_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc) + 
              seats +
              dem_score )
summary(fit1a)

fit1b <- lm(data = df, prop_surli2005 ~ 
              major_es_categories2 +
              pv_cat +
              log(median_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc) + 
              seats +
              federalism_roeder )
summary(fit1b)

fit1c <- lm(data = df, prop_surli2005 ~ 
              major_es_categories2 +
              pv_cat +
              log(median_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc) + 
              seats +
              spatial_gini_lessmann )
summary(fit1c)

fit1d <- lm(data = df, prop_surli2005 ~ 
              major_es_categories2 +
              pv_cat +
              log(mean_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc) + 
              seats +
              dem_score)
summary(fit1d)

fit1e <- lm(data = df, prop_surli2005 ~ 
              major_es_categories2 +
              pv_cat +
              log(mean_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc) + 
              seats + 
              federalism_roeder)
summary(fit1e)

fit1f <- lm(data = df, prop_surli2005 ~ 
              major_es_categories2 +
              pv_cat +
              log(mean_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+ 
              seats +
              spatial_gini_lessmann)
summary(fit1f)

# Models with mean DM, categorical constituency structure

fit2a <- lm(data = df, prop_surlibirth ~ 
              major_es_categories2 +
              pv_cat +
              log(median_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc) + 
              seats +
              dem_score)
summary(fit2a)


fit2b <- lm(data = df, prop_surlibirth ~ 
              major_es_categories2 +
              pv_cat +
              log(median_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+ 
              seats +
              federalism_roeder)
summary(fit2b)

fit2c <- lm(data = df, prop_surlibirth ~ 
              major_es_categories2 +
              pv_cat +
              log(median_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+ 
              seats+
              spatial_gini_lessmann)
summary(fit2c)

fit2d <- lm(data = df, prop_surlibirth ~ 
              major_es_categories2 +
              pv_cat +
              log(mean_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+ 
              seats+
              dem_score)
summary(fit2d)


fit2e <- lm(data = df, prop_surlibirth ~ 
              major_es_categories2 +
              pv_cat +
              log(mean_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+ 
              seats+
              federalism_roeder)
summary(fit2e)

fit2f <- lm(data = df, prop_surlibirth ~ 
              major_es_categories2 +
              pv_cat +
              log(mean_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+ 
              seats+
              spatial_gini_lessmann)
summary(fit2f)

# Models with median DM, continuous constituency structure + dummy

fit3a <- lm(data = df, prop_surli2005 ~ 
              share_mmd + mixed +
              pv_cat +
              log(median_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+
              seats + 
              dem_score)
summary(fit3a)

fit3b <- lm(data = df, prop_surli2005 ~ 
              share_mmd + mixed +
              pv_cat +
              log(median_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+
              seats + 
              federalism_roeder)
summary(fit3b)

fit3c <- lm(data = df, prop_surli2005 ~ 
              share_mmd + mixed +
              pv_cat +
              log(median_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+
              seats + 
              spatial_gini_lessmann)
summary(fit3c)

fit3d <- lm(data = df, prop_surli2005 ~ 
              share_mmd + mixed +
              pv_cat +
              log(mean_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+
              seats + 
              dem_score)
summary(fit3d)

fit3e <- lm(data = df, prop_surli2005 ~ 
              share_mmd + mixed +
              pv_cat +
              log(mean_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+
              seats + 
              federalism_roeder)
summary(fit3e)

fit3f <- lm(data = df, prop_surli2005 ~ 
              share_mmd + mixed +
              pv_cat +
              log(mean_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc) +
              seats + 
              spatial_gini_lessmann)
summary(fit3f)

# Models with mean DM, continuous constituency structure + dummy

fit4a <- lm(data = df, prop_surlibirth ~ 
              share_mmd + mixed +
              pv_cat +
              log(median_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+
              seats + 
              dem_score)
summary(fit4a)

fit4b <- lm(data = df, prop_surlibirth ~ 
              share_mmd + mixed +
              pv_cat +
              log(median_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+
              seats + 
              federalism_roeder)
summary(fit4b)

fit4c <- lm(data = df, prop_surlibirth ~ 
              share_mmd + mixed +
              pv_cat +
              log(median_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+
              seats + 
              spatial_gini_lessmann)
summary(fit4c)


fit4d <- lm(data = df, prop_surlibirth ~ 
              share_mmd + mixed +
              pv_cat +
              log(mean_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+
              seats + 
              dem_score)
summary(fit4d)

fit4e <- lm(data = df, prop_surlibirth ~ 
              share_mmd + mixed +
              pv_cat +
              log(mean_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+
              seats + 
              federalism_roeder)
summary(fit4e)

fit4f <- lm(data = df, prop_surlibirth ~ 
              share_mmd + mixed +
              pv_cat +
              log(mean_nat_ter) + 
              log(population) + 
              log(land_area) + 
              log(gdp_pc)+
              seats + 
              spatial_gini_lessmann)
summary(fit4f)

# main model, exclude micro-countries

fit1a_sub <- lm(data = subset(df, n_grids > 20), prop_surli2005 ~ 
                  major_es_categories2 +
                  pv_cat +
                  log(median_nat_ter) + 
                  log(population) + 
                  log(land_area) + 
                  log(gdp_pc)+
                  seats + 
                  dem_score)
summary(fit1a_sub)

fit2a_sub <- lm(data = subset(df, n_grids > 20), prop_surlibirth ~ 
                  major_es_categories2 +
                  pv_cat +
                  log(median_nat_ter) + 
                  log(population) + 
                  log(land_area) + 
                  log(gdp_pc)+
                  seats + 
                  dem_score)
summary(fit2a_sub)

fit3a_sub <- lm(data = subset(df, n_grids > 20), prop_surli2005 ~ 
                  share_mmd + mixed +
                  pv_cat +
                  log(median_nat_ter) + 
                  log(population) + 
                  log(land_area) + 
                  log(gdp_pc)+
                  seats + 
                  dem_score)
summary(fit3a_sub)

fit4a_sub <- lm(data = subset(df, n_grids > 20), prop_surlibirth ~ 
                  share_mmd + mixed +
                  pv_cat +
                  log(median_nat_ter) + 
                  log(population) + 
                  log(land_area) + 
                  log(gdp_pc)+
                  seats + 
                  dem_score)
summary(fit4a_sub)

# main model, exclude USA

fit1a_nous <- lm(data = subset(df, country != "USA"), prop_surli2005 ~ 
                   major_es_categories2 +
                   pv_cat +
                   log(median_nat_ter) + 
                   log(population) + 
                   log(land_area) + 
                   log(gdp_pc)+
                   seats +
                   dem_score)
summary(fit1a_nous)

fit2a_nous <- lm(data = subset(df, country != "USA"), prop_surlibirth ~ 
                   major_es_categories2 +
                   pv_cat +
                   log(median_nat_ter) + 
                   log(population) + 
                   log(land_area) + 
                   log(gdp_pc)+
                   seats +
                   dem_score)
summary(fit2a_nous)

fit3a_nous <- lm(data = subset(df, country != "USA"), prop_surli2005 ~ 
                   share_mmd + mixed +
                   pv_cat +
                   log(median_nat_ter) + 
                   log(population) + 
                   log(land_area) + 
                   log(gdp_pc)+
                   seats +
                   dem_score)
summary(fit3a_nous)

fit4a_nous <- lm(data = subset(df, country != "USA"), prop_surlibirth ~ 
                   share_mmd + mixed +
                   pv_cat +
                   log(median_nat_ter) + 
                   log(population) + 
                   log(land_area) + 
                   log(gdp_pc)+
                   seats +
                   dem_score)
summary(fit4a_nous)

# main model, control for presidentialism

fit1a_prez <- lm(data = df, prop_surli2005 ~ 
                   major_es_categories2 +
                   pv_cat +
                   log(median_nat_ter) + 
                   log(population) + 
                   log(land_area) + 
                   log(gdp_pc)+
                   seats +
                   dem_score + 
                   presidential)
summary(fit1a_prez)

fit2a_prez <- lm(data = df, prop_surlibirth ~ 
                   major_es_categories2 +
                   pv_cat +
                   log(median_nat_ter) + 
                   log(population) + 
                   log(land_area) + 
                   log(gdp_pc)+
                   seats +
                   dem_score + 
                   presidential)
summary(fit2a_prez)

fit3a_prez <- lm(data = df, prop_surli2005 ~ 
                   share_mmd + mixed +
                   pv_cat +
                   log(median_nat_ter) + 
                   log(population) + 
                   log(land_area) + 
                   log(gdp_pc)+
                   seats +
                   dem_score + 
                   presidential)
summary(fit3a_prez)

fit4a_prez <- lm(data = df, prop_surlibirth ~ 
                   share_mmd + mixed +
                   pv_cat +
                   log(median_nat_ter) + 
                   log(population) + 
                   log(land_area) + 
                   log(gdp_pc)+
                   seats +
                   dem_score + 
                   presidential)
summary(fit4a_prez)

# main model, interact PV and DM

fit1a_interact <- lm(data = df, prop_surli2005 ~ 
                       major_es_categories2 +
                       pv_cat*log(median_nat_ter) + 
                       log(population) + 
                       log(land_area) + 
                       log(gdp_pc)+
                       seats +
                       dem_score)
summary(fit1a_interact)

fit2a_interact <- lm(data = df, prop_surlibirth ~ 
                       major_es_categories2 +
                       pv_cat*log(median_nat_ter) + 
                       log(population) + 
                       log(land_area) + 
                       log(gdp_pc)+
                       seats +
                       dem_score)
summary(fit2a_interact)

fit3a_interact <- lm(data = df, prop_surli2005 ~ 
                       share_mmd + mixed +
                       pv_cat*log(median_nat_ter) + 
                       log(population) + 
                       log(land_area) + 
                       log(gdp_pc)+
                       seats +
                       dem_score)
summary(fit3a_interact)

fit4a_interact <- lm(data = df, prop_surlibirth ~ 
                       share_mmd + mixed +
                       pv_cat*log(median_nat_ter) + 
                       log(population) + 
                       log(land_area) + 
                       log(gdp_pc)+
                       seats +
                       dem_score)
summary(fit4a_interact)

# main model, number of districts used instead of DM

fit1a_nodist <- lm(data = df, prop_surli2005 ~ 
                     major_es_categories2 +
                     pv_cat +
                     no_districts + 
                     log(population) + 
                     log(land_area) + 
                     log(gdp_pc)+
                     seats +
                     dem_score)
summary(fit1a_nodist)

fit2a_nodist <- lm(data = df, prop_surlibirth ~ 
                     major_es_categories2 +
                     pv_cat +
                     no_districts + 
                     log(population) + 
                     log(land_area) + 
                     log(gdp_pc)+
                     seats +
                     dem_score)
summary(fit2a_nodist)

fit3a_nodist <- lm(data = df, prop_surli2005 ~ 
                     share_mmd + mixed +
                     pv_cat +
                     no_districts + 
                     log(population) + 
                     log(land_area) + 
                     log(gdp_pc)+
                     seats +
                     dem_score)
summary(fit3a_nodist)

fit4a_nodist <- lm(data = df, prop_surlibirth ~ 
                     share_mmd + mixed +
                     pv_cat +
                     no_districts + 
                     log(population) + 
                     log(land_area) + 
                     log(gdp_pc)+
                     seats +
                     dem_score)
summary(fit4a_nodist)

# main model, control for residency requirements 
# (USA, Argentina, Brazil, Chile, Taiwan, Ecuador)

fit1a_residency <- lm(data = df, prop_surli2005 ~ 
                        major_es_categories2 +
                        pv_cat +
                        log(median_nat_ter) + 
                        log(population) + 
                        log(land_area) + 
                        log(gdp_pc)+
                        seats +
                        dem_score + 
                        residency)
summary(fit1a_residency)

fit2a_residency <- lm(data = df, prop_surlibirth ~ 
                        major_es_categories2 +
                        pv_cat +
                        log(median_nat_ter) + 
                        log(population) + 
                        log(land_area) + 
                        log(gdp_pc)+
                        seats +
                        dem_score + 
                        residency)
summary(fit2a_residency)

fit3a_residency <- lm(data = df, prop_surli2005 ~ 
                        share_mmd + mixed +
                        pv_cat +
                        log(median_nat_ter) + 
                        log(population) + 
                        log(land_area) + 
                        log(gdp_pc)+
                        seats +
                        dem_score + 
                        residency)
summary(fit3a_residency)

fit4a_residency <- lm(data = df, prop_surlibirth ~ 
                        share_mmd + mixed +
                        pv_cat +
                        log(median_nat_ter) + 
                        log(population) + 
                        log(land_area) + 
                        log(gdp_pc)+
                        seats +
                        dem_score + 
                        residency)
summary(fit4a_residency)

# main model, logged DV

fit1a_logged <- lm(data = df, log10(prop_surli2005) ~ 
                     major_es_categories2 +
                     pv_cat +
                     log(median_nat_ter) + 
                     log(population) + 
                     log(land_area) + 
                     log(gdp_pc)+
                     seats +
                     dem_score 
)
summary(fit1a_logged)

fit2a_logged <- lm(data = df, log10(prop_surlibirth) ~ 
                     major_es_categories2 +
                     pv_cat +
                     log(median_nat_ter) + 
                     log(population) + 
                     log(land_area) + 
                     log(gdp_pc)+
                     seats +
                     dem_score
)
summary(fit2a_logged)

fit3a_logged <- lm(data = df, log10(prop_surli2005) ~ 
                     share_mmd + mixed +
                     pv_cat +
                     log(median_nat_ter) + 
                     log(population) + 
                     log(land_area) + 
                     log(gdp_pc)+
                     seats +
                     dem_score
)
summary(fit3a_logged)

fit4a_logged <- lm(data = df, log10(prop_surlibirth) ~ 
                     share_mmd + mixed +
                     pv_cat +
                     log(median_nat_ter) + 
                     log(population) + 
                     log(land_area) + 
                     log(gdp_pc)+
                     seats +
                     dem_score)
summary(fit4a_logged)

# main model, z-score DV

fit1a_normalised <- lm(data = df, (zscore_surli2005) ~ 
                         major_es_categories2 +
                         pv_cat +
                         log(median_nat_ter) + 
                         log(population) + 
                         log(land_area) + 
                         log(gdp_pc)+
                         seats +
                         dem_score 
)
summary(fit1a_normalised)

fit2a_normalised <- lm(data = df, (zscore_surlibirth) ~ 
                         major_es_categories2 +
                         pv_cat +
                         log(median_nat_ter) + 
                         log(population) + 
                         log(land_area) + 
                         log(gdp_pc)+
                         seats +
                         dem_score
)
summary(fit2a_normalised)

fit3a_normalised <- lm(data = df, (zscore_surli2005) ~ 
                         share_mmd + mixed +
                         pv_cat +
                         log(median_nat_ter) + 
                         log(population) + 
                         log(land_area) + 
                         log(gdp_pc)+
                         seats +
                         dem_score
)
summary(fit3a_normalised)

fit4a_normalised <- lm(data = df, (zscore_surlibirth) ~ 
                         share_mmd + mixed +
                         pv_cat +
                         log(median_nat_ter) + 
                         log(population) + 
                         log(land_area) + 
                         log(gdp_pc)+
                         seats +
                         dem_score)
summary(fit4a_normalised)

# main model, only assembly size, no population control

fit1a_nopop <- lm(data = df, prop_surli2005 ~ 
                    major_es_categories2 +
                    pv_cat +
                    log(median_nat_ter) + 
                    log(land_area) + 
                    log(gdp_pc)+
                    seats +
                    dem_score)
summary(fit1a_nopop)

fit2a_nopop <- lm(data = df, prop_surlibirth ~ 
                    major_es_categories2 +
                    pv_cat +
                    log(median_nat_ter) + 
                    log(land_area) + 
                    log(gdp_pc)+
                    seats +
                    dem_score)
summary(fit2a_nopop)

fit3a_nopop <- lm(data = df, prop_surli2005 ~ 
                    share_mmd + mixed +
                    pv_cat +
                    log(median_nat_ter) + 
                    log(land_area) + 
                    log(gdp_pc)+
                    seats +
                    dem_score)
summary(fit3a_nopop)

fit4a_nopop <- lm(data = df, prop_surlibirth ~ 
                    share_mmd + mixed +
                    pv_cat +
                    log(median_nat_ter) + 
                    log(land_area) + 
                    log(gdp_pc)+
                    seats +
                    dem_score)
summary(fit4a_nopop)

#### Regression Tables ####

# Main model (table 3)

stargazer(fit1a, fit3a, fit2a, fit4a,
          single.row = T, font.size = "footnotesize", digits = 2, 
          dep.var.labels = c("SURLI (2005 benchmark)", 
                             "SURLI (mean legislator birth year benchmark)"), 
          type = "latex")

# Supplementary Material, Table S1, alternative specifications of model 1 in table 3

stargazer(fit1b, fit1c, fit1d, fit1e, fit1f,
          single.row = T, font.size = "footnotesize", digits = 2, 
          dep.var.labels = "SURLI (2005 benchmark)", type = "latex")

# Supplementary Material, Table S2, alternative specifications of model 2 in table 3

stargazer(fit3b, fit3c, fit3d, fit3e, fit3f,
          single.row = T, font.size = "footnotesize", digits = 2, 
          dep.var.labels = "SURLI (2005 benchmark)", type = "latex")

# Supplementary Material, Table S3, alternative specifications of model 3 in table 3

stargazer(fit2b, fit2c, fit2d, fit2e, fit2f,
          single.row = T, font.size = "footnotesize", digits = 2, 
          dep.var.labels = "SURLI (mean legislator birth year benchmark)", 
          type = "latex")

# Supplementary Material, Table S4, alternative specifications of model 4 in table 3

stargazer(fit4b, fit4c, fit4d, fit4e, fit4f,
          single.row = T, font.size = "footnotesize", digits = 2, 
          dep.var.labels = "SURLI (mean legislator birth year benchmark)", type = "latex")

# Supplementary Material, Table S5, main model excluding micro-countries

stargazer(fit1a_sub, fit3a_sub, fit2a_sub, fit4a_sub,
          single.row = T, font.size = "footnotesize", digits = 2, 
          dep.var.labels = c("SURLI (2005 benchmark)", "SURLI (mean legislator birth year benchmark)"),
          type = "latex")

# Supplementary Material, Table S6, main model excluding US

stargazer(fit1a_nous, fit3a_nous, fit2a_nous, fit4a_nous,
          single.row = T, font.size = "footnotesize", digits = 2, 
          dep.var.labels = c("SURLI (2005 benchmark)", "SURLI (mean legislator birth year benchmark)"),
          type = "latex")

# Supplementary Material, Table S7, main model controlling for presidentialism

stargazer(fit1a_prez, fit3a_prez, fit2a_prez, fit4a_prez,
          single.row = T, font.size = "footnotesize", digits = 2, 
          dep.var.labels = c("SURLI (2005 benchmark)", 
                             "SURLI (mean legislator birth year benchmark)"), type = "latex")

# Supplementary Material, Table S8, main model w/ interaction of DM and PV

stargazer(fit1a_interact, fit3a_interact, fit2a_interact, fit4a_interact,
          single.row = T, font.size = "footnotesize", digits = 2, 
          dep.var.labels = c("SURLI (2005 benchmark)", "SURLI (mean legislator birth year benchmark)"),
          type = "latex")

# Supplementary Material, Table S9, number of districts used instead of median DM

stargazer(fit1a_nodist, fit3a_nodist, fit2a_nodist, fit4a_nodist,
          single.row = T, font.size = "footnotesize", digits = 2, 
          dep.var.labels = c("SURLI (2005 benchmark)", "SURLI (mean legislator birth year benchmark)"),
          type = "latex")

# Supplementary Material, Table S10, control for residency requirements

stargazer(fit1a_residency, fit3a_residency, fit2a_residency, fit4a_residency,
          single.row = T, font.size = "footnotesize", digits = 2, 
          dep.var.labels = c("SURLI (2005 benchmark)", 
                             "SURLI (mean legislator birth year benchmark)"), type = "latex")

# Supplementary Material, Table S11, logged dependent variable

stargazer(fit1a_logged, fit3a_logged, fit2a_logged, fit4a_logged,
          single.row = T, font.size = "footnotesize", digits = 2, 
          dep.var.labels = c("SURLI (2005 benchmark)", 
                             "SURLI (mean legislator birth year benchmark)"), type = "latex")

# Supplementary Material, Table S12, ‘old’ dependent variable (z-score rather than ratio)

stargazer(fit1a_normalised, fit3a_normalised, fit2a_normalised, fit4a_normalised,
          single.row = T, font.size = "footnotesize", digits = 2, 
          dep.var.labels = c("SURLI (2005 benchmark)", 
                             "SURLI (mean legislator birth year benchmark)"), type = "latex")

# Supplementary Material, Table S13, main model without logged population control

stargazer(fit1a_nopop, fit3a_nopop, fit2a_nopop, fit4a_nopop,
          single.row = T, font.size = "footnotesize", digits = 2, 
          dep.var.labels = c("SURLI (2005 benchmark)", 
                             "SURLI (mean legislator birth year benchmark)"), type = "latex")
